Load the required packages:
library(tidyverse)
library(glue)
library(knitr)
library(Matrix)
library(scRNAseq)
library(scater)
library(scran)
library(dittoSeq)
library(reticulate)
library(zellkonverter)Create a conda environment containing the required Python packages:
if (!file.exists(miniconda_path())) {
install_miniconda()
}
conda_install("r-cellrank", c("cellrank-krylov", "python-igraph"),
python_version = "3.8", channel = c("conda-forge", "bioconda"))Miniconda installation path:
miniconda_path()## [1] "/home/rstudio/.local/share/r-miniconda"
List the available conda environments:
conda_list()| name | python |
|---|---|
| 0 | /home/rstudio/.cache/R/basilisk/1.4.0/0/bin/python |
| zellkonverterAnnDataEnv | /home/rstudio/.cache/R/basilisk/1.4.0/zellkonverter/1.2.1/zellkonverterAnnDataEnv/bin/python |
| r-miniconda | /home/rstudio/.local/share/r-miniconda/bin/python |
| r-cellrank | /home/rstudio/.local/share/r-miniconda/envs/r-cellrank/bin/python |
| r-reticulate | /home/rstudio/.local/share/r-miniconda/envs/r-reticulate/bin/python |
Use the created conda environment:
use_condaenv("r-cellrank", required = TRUE)# Load the Hermann et al. (2018) mouse spermatogenic cells dataset
sce <- HermannSpermatogenesisData()
sce## class: SingleCellExperiment
## dim: 54448 2325
## metadata(0):
## assays(2): spliced unspliced
## rownames(54448): ENSMUSG00000102693.1 ENSMUSG00000064842.1 ...
## ENSMUSG00000064369.1 ENSMUSG00000064372.1
## rowData names(0):
## colnames(2325): CCCATACTCCGAAGAG AATCCAGTCATCTGCC ... ATCCACCCACCACCAG
## ATTGGTGGTTACCGAT
## colData names(1): celltype
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
# Fix row names
rownames(sce) <- str_remove(rownames(sce), ".\\d+$")
# Cell metadata
colData(sce)## DataFrame with 2325 rows and 1 column
## celltype
## <character>
## CCCATACTCCGAAGAG DIplotene/Secondary ..
## AATCCAGTCATCTGCC DIplotene/Secondary ..
## GACTGCGGTTTGACTG DIplotene/Secondary ..
## ACACCAATCTTCGGTC DIplotene/Secondary ..
## TGACAACAGGACAGAA Mid Round spermatids
## ... ...
## CCCTCCTCATCGGACC NA
## GAAACTCCACTATCTT NA
## AACGTTGTCATCGATG NA
## ATCCACCCACCACCAG NA
## ATTGGTGGTTACCGAT NA
# Cell type summary
sce$celltype %>%
fct_explicit_na(na_level = "NA") %>%
table() %>%
enframe() %>%
arrange(desc(value))| name | value |
|---|---|
| Mid Round spermatids | 878 |
| NA | 447 |
| Late Round spermatids | 423 |
| Early Round spermatids | 410 |
| DIplotene/Secondary spermatocytes | 136 |
| Pachytene spermatocytes | 10 |
| A3-A4-In-B Differentiating spermatogonia | 7 |
| Leptotene/Zygotene spermatocytes | 7 |
| Preleptotene spermatocytes | 3 |
| Sertoli cells | 2 |
| A1-A2 Differentiating spermatogonia | 1 |
| Undifferentiated spermatogonia | 1 |
# Prepare the cell type labels
sce$cell_type <- sce$celltype %>%
fct_explicit_na(na_level = "Other") %>%
str_remove(" spermatids") %>%
fct_lump_min(min = 400, other_level = "Other") %>%
fct_relevel("Early Round", "Mid Round", "Late Round", "Other") %>%
fct_drop()
table(sce$cell_type)##
## Early Round Mid Round Late Round Other
## 410 878 423 614
# Set the RNG seed for reproducible results
set.seed(42)
# Data normalization
quick_clusters <- quickCluster(sce, assay.type = "spliced")
table(quick_clusters)## quick_clusters
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 133 139 244 254 176 117 196 177 157 201 213 197 121
sce <- computeSumFactors(sce, clusters = quick_clusters, assay.type = "spliced")
sce <- logNormCounts(sce, assay.type = "spliced")
# Selecting highly variable genes
dec <- modelGeneVar(sce)
top_hvgs <- getTopHVGs(dec, fdr.threshold = 0.05)
length(top_hvgs)## [1] 975
# Principal component analysis
sce <- denoisePCA(sce, dec, subset.row = top_hvgs, min.rank = 5, max.rank = 50)
ncol(reducedDim(sce, "PCA"))## [1] 5
# Dimensionality reduction by UMAP
sce <- runUMAP(sce, dimred = "PCA")dittoDimPlot(sce, "cell_type", reduction.use = "UMAP", size = 0.5,
legend.show = FALSE, do.label = TRUE,
labels.highlight = FALSE, labels.size = 4)dittoDimPlot(sce, "cell_type", reduction.use = "UMAP", size = 0.1,
split.by = "cell_type", split.ncol = 2, legend.show = FALSE)sce_adata <- SingleCellExperiment(
assays = list(
counts = assay(sce, "spliced"),
spliced = assay(sce, "spliced"),
unspliced = assay(sce, "unspliced")
),
colData = DataFrame(
cell_type = sce$cell_type
),
reducedDims = list(
X_umap = reducedDim(sce, "UMAP")
)
)
sce## class: SingleCellExperiment
## dim: 54448 2325
## metadata(0):
## assays(3): spliced unspliced logcounts
## rownames(54448): ENSMUSG00000102693 ENSMUSG00000064842 ...
## ENSMUSG00000064369 ENSMUSG00000064372
## rowData names(0):
## colnames(2325): CCCATACTCCGAAGAG AATCCAGTCATCTGCC ... ATCCACCCACCACCAG
## ATTGGTGGTTACCGAT
## colData names(3): celltype cell_type sizeFactor
## reducedDimNames(2): PCA UMAP
## mainExpName: NULL
## altExpNames(0):
adata <- SCE2AnnData(sce_adata)
adata## AnnData object with n_obs × n_vars = 2325 × 54448
## obs: 'cell_type'
## uns: 'X_name'
## obsm: 'X_umap'
## layers: 'spliced', 'unspliced'
Load the required Python modules:
import scanpy as sc
import scvelo as scv
import cellrank as cr
import matplotlib.pyplot as plt
scv.settings.verbosity = 0
cr.settings.verbosity = 0Get the anndata object:
adata = r.adataPie chart of spliced/unspliced proportions:
scv.pl.proportions(adata, groupby='cell_type')Preprocess the data:
scv.pp.filter_and_normalize(adata, n_top_genes=2000, enforce=True)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)RNA velocity analysis using the dynamical model:
scv.tl.recover_dynamics(adata, n_jobs = 4)scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)Stream plot of velocities on the UMAP embedding:
scv.pl.velocity_embedding_stream(adata, basis='umap', color='cell_type')scv.tl.velocity_confidence(adata)
scv.pl.scatter(adata, basis='umap', color='velocity_length',
cmap='coolwarm', perc=[5, 95])scv.pl.scatter(adata, basis='umap', color='velocity_confidence',
cmap='coolwarm', perc=[5, 95])scv.tl.velocity_pseudotime(adata)
scv.pl.velocity_graph(adata, basis='umap', color='cell_type', threshold=0.1,
legend_loc='best')scv.pl.scatter(adata, basis='umap', color='velocity_pseudotime',
cmap='gnuplot')scv.tl.paga(adata, groups='cell_type')
scv.pl.paga(adata, basis='umap', size=50, alpha=0.1, min_edge_width=2,
node_size_scale=1.5, legend_loc='best')scv.tl.rank_velocity_genes(adata, groupby='cell_type', min_corr=0.3)
diff_velo_df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])head(py$diff_velo_df)| Early Round | Mid Round | Late Round | Other |
|---|---|---|---|
| ENSMUSG00000054910 | ENSMUSG00000096405 | ENSMUSG00000087524 | ENSMUSG00000031770 |
| ENSMUSG00000028701 | ENSMUSG00000086732 | ENSMUSG00000040446 | ENSMUSG00000061607 |
| ENSMUSG00000028248 | ENSMUSG00000109981 | ENSMUSG00000024500 | ENSMUSG00000028132 |
| ENSMUSG00000110611 | ENSMUSG00000097325 | ENSMUSG00000099684 | ENSMUSG00000020328 |
| ENSMUSG00000097392 | ENSMUSG00000034040 | ENSMUSG00000104111 | ENSMUSG00000022462 |
| ENSMUSG00000100684 | ENSMUSG00000030276 | ENSMUSG00000110629 | ENSMUSG00000026743 |
scv.pl.velocity(adata, diff_velo_df.iloc[0, :], color='cell_type', ncols = 1)adata## AnnData object with n_obs × n_vars = 2325 × 2000
## obs: 'cell_type', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts', 'velocity_self_transition', 'velocity_length', 'velocity_confidence', 'velocity_confidence_transition', 'root_cells', 'end_points', 'velocity_pseudotime'
## var: 'gene_count_corr', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'fit_r2', 'fit_alpha', 'fit_beta', 'fit_gamma', 'fit_t_', 'fit_scaling', 'fit_std_u', 'fit_std_s', 'fit_likelihood', 'fit_u0', 'fit_s0', 'fit_pval_steady', 'fit_steady_u', 'fit_steady_s', 'fit_variance', 'fit_alignment_scaling', 'velocity_genes', 'spearmans_score', 'velocity_score'
## uns: 'X_name', 'pca', 'neighbors', 'recover_dynamics', 'velocity_params', 'velocity_graph', 'velocity_graph_neg', 'cell_type_colors', 'paga', 'cell_type_sizes', 'rank_velocity_genes'
## obsm: 'X_umap', 'X_pca', 'velocity_umap'
## varm: 'PCs', 'loss'
## layers: 'spliced', 'unspliced', 'Ms', 'Mu', 'fit_t', 'fit_tau', 'fit_tau_', 'velocity', 'velocity_u'
## obsp: 'distances', 'connectivities'
Calculating the transition matrix:
vtk = cr.tl.kernels.VelocityKernel(adata)
vtk.compute_transition_matrix()Plot the UMAP projection of the transition matrix:
vtk.compute_projection(basis='umap')
scv.pl.velocity_embedding_stream(adata, basis='umap', color='cell_type',
vkey='T_fwd')Computing Schur decomposition:
gpcaa = cr.tl.estimators.GPCCA(vtk)
gpcaa.compute_schur(n_components=10)Top eigenvalues plot:
gpcaa.plot_spectrum(real_only=True)
plt.show()Computing macrostates:
gpcaa.compute_macrostates(n_states=2, cluster_key='cell_type')
gpcaa.plot_macrostates()Computing terminal states:
gpcaa.compute_terminal_states()
gpcaa.plot_terminal_states()Calculating absorption probabilities:
gpcaa.compute_absorption_probabilities()Plot the absorption probabilities:
gpcaa.plot_absorption_probabilities(legend_loc='best')gpcaa.plot_absorption_probabilities(same_plot=False, ncols=1)adata## AnnData object with n_obs × n_vars = 2325 × 2000
## obs: 'cell_type', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts', 'velocity_self_transition', 'velocity_length', 'velocity_confidence', 'velocity_confidence_transition', 'root_cells', 'end_points', 'velocity_pseudotime', 'clusters_gradients', 'terminal_states', 'terminal_states_probs'
## var: 'gene_count_corr', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'fit_r2', 'fit_alpha', 'fit_beta', 'fit_gamma', 'fit_t_', 'fit_scaling', 'fit_std_u', 'fit_std_s', 'fit_likelihood', 'fit_u0', 'fit_s0', 'fit_pval_steady', 'fit_steady_u', 'fit_steady_s', 'fit_variance', 'fit_alignment_scaling', 'velocity_genes', 'spearmans_score', 'velocity_score'
## uns: 'X_name', 'pca', 'neighbors', 'recover_dynamics', 'velocity_params', 'velocity_graph', 'velocity_graph_neg', 'cell_type_colors', 'paga', 'cell_type_sizes', 'rank_velocity_genes', 'T_fwd_params', 'schur_matrix_fwd', 'eigendecomposition_fwd', 'coarse_fwd', 'clusters_gradients_colors', 'terminal_states_colors'
## obsm: 'X_umap', 'X_pca', 'velocity_umap', 'T_fwd_umap', 'schur_vectors_fwd', 'terminal_states_memberships', 'to_terminal_states'
## varm: 'PCs', 'loss'
## layers: 'spliced', 'unspliced', 'Ms', 'Mu', 'fit_t', 'fit_tau', 'fit_tau_', 'velocity', 'velocity_u'
## obsp: 'distances', 'connectivities'
adata.uns['terminal_states_names'] = adata.obsm['to_terminal_states'].names
del adata.uns['coarse_fwd']
adata.write('rna_velocity_scvelo_hermann_2018.h5ad', compression='gzip')Read the h5ad file:
sce_cellrank <- readH5AD("rna_velocity_scvelo_hermann_2018.h5ad")
sce_cellrank## class: SingleCellExperiment
## dim: 2000 2325
## metadata(15): T_fwd_params cell_type_colors ... velocity_graph_neg
## velocity_params
## assays(10): counts Ms ... velocity velocity_u
## rownames(2000): ENSMUSG00000025903 ENSMUSG00000097171 ...
## ENSMUSG00000064367 ENSMUSG00000064370
## rowData names(25): gene_count_corr means ... velocity_score varm
## colnames(2325): CCCATACTCCGAAGAG AATCCAGTCATCTGCC ... ATCCACCCACCACCAG
## ATTGGTGGTTACCGAT
## colData names(15): cell_type initial_size_spliced ... terminal_states
## terminal_states_probs
## reducedDimNames(7): T_fwd_umap X_pca ... to_terminal_states
## velocity_umap
## mainExpName: NULL
## altExpNames(0):
Get terminal state absorption probabilities for each cell:
absorption_probabilities <- setNames(
as.data.frame(reducedDim(sce_cellrank, "to_terminal_states")),
metadata(sce_cellrank)$terminal_states_names
)
head(absorption_probabilities)| Late Round | Other | |
|---|---|---|
| CCCATACTCCGAAGAG | 0.7484622 | 0.2515375 |
| AATCCAGTCATCTGCC | 0.7509432 | 0.2490566 |
| GACTGCGGTTTGACTG | 0.7514131 | 0.2485865 |
| ACACCAATCTTCGGTC | 0.7471087 | 0.2528910 |
| TGACAACAGGACAGAA | 0.7650655 | 0.2349341 |
| TTGGAACAGGCGTACA | 0.7670970 | 0.2329027 |
sessionInfo()## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] zellkonverter_1.2.1 reticulate_1.22
## [3] dittoSeq_1.4.3 scran_1.20.1
## [5] scater_1.20.1 scuttle_1.2.1
## [7] scRNAseq_2.6.1 SingleCellExperiment_1.14.1
## [9] SummarizedExperiment_1.22.0 Biobase_2.52.0
## [11] GenomicRanges_1.44.0 GenomeInfoDb_1.28.4
## [13] IRanges_2.26.0 S4Vectors_0.30.2
## [15] BiocGenerics_0.38.0 MatrixGenerics_1.4.3
## [17] matrixStats_0.61.0 Matrix_1.3-4
## [19] knitr_1.36 glue_1.4.2
## [21] forcats_0.5.1 stringr_1.4.0
## [23] dplyr_1.0.7 purrr_0.3.4
## [25] readr_2.0.2 tidyr_1.1.4
## [27] tibble_3.1.5 ggplot2_3.3.5
## [29] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.1.1
## [3] RSQLite_2.2.8 AnnotationDbi_1.54.1
## [5] grid_4.1.0 BiocParallel_1.26.2
## [7] munsell_0.5.0 ScaledMatrix_1.0.0
## [9] statmod_1.4.36 withr_2.4.2
## [11] colorspace_2.0-2 filelock_1.0.2
## [13] highr_0.9 rstudioapi_0.13
## [15] labeling_0.4.2 GenomeInfoDbData_1.2.6
## [17] bit64_4.0.5 farver_2.1.0
## [19] pheatmap_1.0.12 rprojroot_2.0.2
## [21] basilisk_1.4.0 vctrs_0.3.8
## [23] generics_0.1.0 xfun_0.26
## [25] BiocFileCache_2.0.0 R6_2.5.1
## [27] ggbeeswarm_0.6.0 rsvd_1.0.5
## [29] locfit_1.5-9.4 AnnotationFilter_1.16.0
## [31] bitops_1.0-7 cachem_1.0.6
## [33] DelayedArray_0.18.0 assertthat_0.2.1
## [35] promises_1.2.0.1 BiocIO_1.2.0
## [37] scales_1.1.1 beeswarm_0.4.0
## [39] gtable_0.3.0 beachmat_2.8.1
## [41] ensembldb_2.16.4 rlang_0.4.11
## [43] rtracklayer_1.52.1 lazyeval_0.2.2
## [45] broom_0.7.9 BiocManager_1.30.16
## [47] yaml_2.2.1 modelr_0.1.8
## [49] GenomicFeatures_1.44.2 backports_1.2.1
## [51] httpuv_1.6.3 tools_4.1.0
## [53] ellipsis_0.3.2 jquerylib_0.1.4
## [55] RColorBrewer_1.1-2 ggridges_0.5.3
## [57] Rcpp_1.0.7 plyr_1.8.6
## [59] sparseMatrixStats_1.4.2 progress_1.2.2
## [61] zlibbioc_1.38.0 RCurl_1.98-1.5
## [63] basilisk.utils_1.4.0 prettyunits_1.1.1
## [65] viridis_0.6.1 cowplot_1.1.1
## [67] haven_2.4.3 ggrepel_0.9.1
## [69] cluster_2.1.2 fs_1.5.0
## [71] here_1.0.1 magrittr_2.0.1
## [73] reprex_2.0.1 ProtGenerics_1.24.0
## [75] hms_1.1.1 mime_0.12
## [77] evaluate_0.14 xtable_1.8-4
## [79] XML_3.99-0.8 readxl_1.3.1
## [81] gridExtra_2.3 compiler_4.1.0
## [83] biomaRt_2.48.3 crayon_1.4.1
## [85] htmltools_0.5.2 later_1.3.0
## [87] tzdb_0.1.2 lubridate_1.8.0
## [89] DBI_1.1.1 ExperimentHub_2.0.0
## [91] dbplyr_2.1.1 rappdirs_0.3.3
## [93] cli_3.0.1 metapod_1.0.0
## [95] igraph_1.2.6 pkgconfig_2.0.3
## [97] GenomicAlignments_1.28.0 dir.expiry_1.0.0
## [99] xml2_1.3.2 vipor_0.4.5
## [101] bslib_0.3.1 dqrng_0.3.0
## [103] XVector_0.32.0 rvest_1.0.1
## [105] digest_0.6.28 Biostrings_2.60.2
## [107] rmarkdown_2.11 cellranger_1.1.0
## [109] edgeR_3.34.1 DelayedMatrixStats_1.14.3
## [111] restfulr_0.0.13 curl_4.3.2
## [113] shiny_1.7.1 Rsamtools_2.8.0
## [115] rjson_0.2.20 lifecycle_1.0.1
## [117] jsonlite_1.7.2 BiocNeighbors_1.10.0
## [119] viridisLite_0.4.0 limma_3.48.3
## [121] fansi_0.5.0 pillar_1.6.3
## [123] lattice_0.20-44 KEGGREST_1.32.0
## [125] fastmap_1.1.0 httr_1.4.2
## [127] interactiveDisplayBase_1.30.0 png_0.1-7
## [129] bluster_1.2.1 BiocVersion_3.13.1
## [131] bit_4.0.4 stringi_1.7.5
## [133] sass_0.4.0 blob_1.2.2
## [135] BiocSingular_1.8.1 AnnotationHub_3.0.1
## [137] memoise_2.0.0 irlba_2.3.3